Homepage
The formatted source code for this file is here.
And a raw version here.
Previous work by Youngser Park can be found here.
On Fri, Dec 11, 2015 at 11:53 AM, joshua vogelstein jovo@jhu.edu wrote:
we will get n=10^6 points, each in d=25 dimensions.
i want to hierarchically cluster them, in a ways:
This corresponds to 24 channels x 6 features per synapse, ordered like c0f0,c0f1,c0f2,c0f3,c0f4,c0f5,c1f0,c1f1… etc
f0 = integrated brightness
f1 = local brightness
f2 = distance to Center of Mass
f3 = moment of inertia around synapsin maxima
f4,f5 are features that I forget what they are.. would need to ask brad.
i would throw them out, I did so in my kohonen code (which you have, its in matlab).
and
On Feb 8, 2016, at 2:00 PM, Kristina Micheva kmicheva@stanford.edu wrote:
and
On March 10, 2016, 00:29:04 (UTC), Kristina Micheva kmicheva@stanford.edu wrote:
There are 2 different Synap channels (2 different antibodies were used), so that part is fine. And 2 different VGluT1 channels (same antibody but done at different times) The NOS channel is the same, so count it as one even though it appears twice. It is listed two times because it can be found at both excitatory and inhibitory synapses. This is where your count of 25 comes, even though there are 24 channels. I would also add the 2 Synap channels to the Inhibitory presynaptic category - there is supposed to be synapsin there, but at lower levels compared to excitatory presynaptic category.
227 in the kohenen.m file which can be found in the dropbox folder.Synap and Synap have been augmented to Synap_1 and Synap_2 for clarity.VGlut1 and VGlut1 have been augmented to VGlut1_t1 and VGlut1_t2 to distinguish between the different times of collection (which are unknown).feat <- fread("../Data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
dim(feat)# [1] 1119299 144
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
'psd','glur2','nmdar1','nr2b','gad','VGAT',
'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
'NOS','TH','VACht','Synapo','tubuli','DAPI')
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
'in.pre','in.post','in.post','in.post','in.pre.small','other',
'ex.post','other','other','ex.post','none','none')
nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel
ffchannel <- (factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
fchannel <- as.numeric(factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
ford <- order(fchannel)
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","mediumblue","gold")
ccol <- Syncol[fchannel]
exType <- factor(c(rep("ex",11),rep("in",6),rep("other",7)),ordered=TRUE)
exCol<-exType;levels(exCol) <- c("#197300","#990000","mediumblue");
exCol <- as.character(exCol)
fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)We will consider only the f0 (integrated brightness) features, and will transform the raw data by scaling and filtering.
f0 <- seq(1,ncol(feat),by=nfeat)
featF0 <- subset(feat, select=f0)
f01e3 <- 1e3*data.table(apply(X=featF0, 2, function(x){((x-min(x))/(max(x)-min(x)))}))
fs <- f01e3
### Taking log_10 on data + 1.
log1f <- log10(featF0 + 1)
slog1f <- scale(log1f, center=TRUE,scale=TRUE)We now have the following data sets:
featF0: Raw data looking only at the integrated brightness features.fs: Raw data scaled between \([0,1000]\).slog1f: The raw data, plus one, \(log_{10}\), then scaled by subtractng mean and dividing by statndard deviation.df <- melt(as.matrix(log1f))
names(df) <- c("ind","channel","value")
df$type <- factor(rep(ffchannel,each=dim(fs)[1]),levels=levels(ffchannel))
lvo <- c(1:5,7:10,19,22,11:16,6,17,18,20,21,23,24)
levels(df$channel)<-levels(df$channel)[lvo]
ts <- 22
gg1 <- ggplot(df, aes(x=value)) +
scale_color_manual(values=ccol[lvo]) +
scale_fill_manual(values=ccol[lvo]) +
geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=100) +
geom_density(aes(group=channel, color=channel),size=1.5) +
facet_wrap( ~ channel, scale='free', ncol=6) +
theme(plot.title=element_text(size=ts),
axis.title.x=element_text(size=ts),
axis.title.y=element_text(size=ts),
legend.title=element_text(size=ts),
legend.text=element_text(size=ts-2),
axis.text=element_text(size=ts-2),
strip.text=element_text(size=ts),
legend.position='none')+
ggtitle("Kernel Density Estimates of `log1f` data.")
print(gg1)log data.tmp <- as.numeric(table(fchannel))
cmatslog1f <- cor(slog1f)
corrplot(cmatslog1f[ford,ford],method="color",tl.col=ccol[ford], tl.cex=0.8)
corrRect(tmp,col=Syncol,lwd=4)pcaLog <- prcomp(cmatslog1f,scale=TRUE, center=TRUE)
elLog <- getElbows(pcaLog$sdev, plot=FALSE) We run K-means for \(K=3\) on the PCA embedded in \(\mathbb{R}^3\) of the correlation matrix.
K1 <- c(3) ## The set of K's.
## Run kmeans on the untransformed data
kvecCor <- foreach(i = K1) %dopar% {
set.seed(2^4 - 1)
kmeans(pcaLog$x[,1:3],centers=i)
}par(mfrow=c(1,2))
plot(pcaLog$x[,1:2],
col=ccol,
pch=as.numeric(exType)+15,
cex=1.5,
xlim=c(min(pcaLog$x[,1])-0.2,max(pcaLog$x[,1])+0.7),
main="Embedding of PCA log_10 correlation data")
text(pcaLog$x[,1:2],col=ccol,label=abbreviate(channel),offset=1, pos=4)
plot(pcaLog$x[,1:2],
col=ccol,
pch=as.numeric(kvecCor[[1]]$cluster)+14,
cex=1.5,
xlim=c(min(pcaLog$x[,1])-0.2,max(pcaLog$x[,1])+0.7),
main="Embedding of PCA log_10 correlation data \ncolred according to truth and \nshape based on clustering from 3-means")
text(pcaLog$x[,1:2],col=ccol,label=abbreviate(channel),offset=1, pos=4)pca <- pcaLog$x[,1:3]
rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s',col=ccol, size=1, main="Log")
rgl::rgl.texts(pca[,1],pca[,2],pca[,3],abbreviate(channel), col=ccol, adj=c(0,1.5))
subid <- currentSubscene3d()
rglwidget(elementId="plot3dLog")Next we run K-means with \(K=3\).
** Note that a seed is being set for the random initialization of K-means. **
K2 <- c(2) ## The set of K's.
## Run kmeans on the untransformed data
kvecslog1f <- foreach(i = K2) %dopar% {
set.seed(2^4 - 1)
kmeans(slog1f,centers=i)
}For the following we manualy choose 2 clusters.
## Formatting data for heatmap
aggslog1f <- aggregate(slog1f,by=list(lab=kvecslog1f[[1]]$cluster),FUN=mean)
aggslog1f <- as.matrix(aggslog1f[,-1])
rownames(aggslog1f) <- clusterFraction(kvecslog1f[[1]])
ford <- order(fchannel)heatmap.2(as.matrix(aggslog1f[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1.25,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90,main="Heatmap of `slog1f` data.") # [1] "#197300" "#197300" "#197300" "#197300" "#197300"
# [6] "#5ed155" "#5ed155" "#5ed155" "#5ed155" "#5ed155"
# [11] "#5ed155" "#660000" "#660000" "#660000" "#cc0000"
# [16] "#cc0000" "#cc0000" "#ff9933" "#ff9933" "mediumblue"
# [21] "mediumblue" "mediumblue" "gold" "gold"
Percentage of data within cluster is presented on the right side of the heatmap.
GABABR### Sampling to reduce size
set.seed(2^13 - 2)
s1 <- sample(dim(slog1f)[1],2.5e5)
dlog1f <- data.table(log1f[s1,])
## re-formatting data for use in lattice
dlog1f2 <- data.table(stack(dlog1f, select=-GABABRF0))[,.(values)]
dlog1f2$GABABR <- dlog1f$GABABRF0
### Adding relationship factor variables
nd <- paste0("GABABR","~",abbreviate(channel[-16]))
dlog1f2$ind <- factor(rep(nd,each=dim(dlog1f)[1]), ordered=TRUE,levels=nd)
head(dlog1f2$ind)# [1] GABABR~Sy_1 GABABR~Sy_1 GABABR~Sy_1 GABABR~Sy_1 GABABR~Sy_1 GABABR~Sy_1
# 23 Levels: GABABR~Sy_1 < GABABR~Sy_2 < GABABR~VG1_1 < ... < GABABR~DAPI
names(dlog1f2) <- c("x","y","g")
rg1 <- xyplot(y ~ x | g, data=dlog1f2,
as.table=TRUE,
colramp=BTC,
pch='.',
scales = list(y = list(relation = "free"),x = list(relation = "free")),
panel=function(x,y,...){
panel.hexbinplot(x,y,...)
panel.loess(x,y,col='red', lwd=2,...)
}
)GABABRHere we consider the channel types to be the “ground truth” and computer the Adjusted Rand Index of between that and the output from k-means.
levels(ffchannel) <- c(rep("ex", 2), rep("in", 2), rep("other", 3))
levels(ffchannel)# [1] "ex" "in" "other"
truth <- as.numeric(ffchannel)Makeing a data.table of the permutation data for ggplot.
DT <- data.table(avd(pcaLog, elLog[2], truth),key='Embedding_Dimension')
DT <- DT[,pval := sum(permARI>=ari)/length(permARI),by=Embedding_Dimension]
ua <- DT[,unique(ari),by=Embedding_Dimension]
arid <- data.frame(Embedding_Dimension=as.numeric(ua$Emb),
ARI=ua$V1,
pval=DT[,unique(pval),by=Embedding_Dimension]$V1)gg3 <- ggplot(data=DT,aes(x=permARI, y=..density..,color=Embedding_Dimension,label=pval)) +
#geom_histogram(binwidth=3.49*sd(DT$permARI)*length(DT$permARI)^(-1/3)) +
geom_histogram(bins=25)+
geom_vline(aes(xintercept=ari),colour='darkred',size=1.2)+
#geom_text(aes(x=(ari-ari/2),y=7))+
theme(axis.text=element_text(size=18),
title=element_text(size=16),
strip.text.x=element_text(size=16)) +
facet_wrap(~Embedding_Dimension+pval,scale='free',labeller=label_both)
print(gg3)gg5 <- ggplot(data=arid,aes(x=Embedding_Dimension,y=ARI,label=pval,colour=pval)) +
geom_line(size=1.5,colour='salmon') + geom_point(size=3) +
#geom_text(hjust='right',vjust='top',nudge_x=0.5,nudge_y=0.01,size=5)+
theme(axis.text=element_text(size=18),
title=element_text(size=16)) +
ggtitle("ARI vs. DIM with estimated p-values")
gg6 <-
ggplot(data=arid, aes(x=Embedding_Dimension, y=pval, colour=pval))+
geom_line(size=1.5, colour='salmon') +
geom_point(size=3) +
# geom_hline(yintercept=0.05,
# colour='forestgreen',
# size=1.5) +
scale_y_log10(breaks=c(1e-4,1.53-4,1e-3), na.value=0) +
theme(axis.text=element_text(size=18),
title=element_text(size=16)) +
ggtitle("P-values")
grid.arrange(gg5,gg6,nrow=2)